##### ####################################################
#####                                               ######
#####       Prepare feature-co-occurrence matrix
#####                                               ######
##### ####################################################

# init ------------------------------------------------------------

# Start from an empty workspace.
# NOTE(review): rm(list = ls()) only clears objects; attached packages and
# options() persist. Prefer running the script in a fresh R session.
rm(list = ls())

# Rescale a numeric vector linearly so its minimum maps to 0 and its maximum
# maps to 1. No NA handling: any NA in `x` makes the whole result NA.
range01 <- function(x) {
  lo <- min(x)
  hi <- max(x)
  (x - lo) / (hi - lo)
}

# Strip leading and trailing whitespace from each element of `s`.
trim <- function(s) {
  no_lead <- sub("^[[:space:]]+", "", s)
  sub("[[:space:]]+$", "", no_lead)
}

# Load libraries

library(quanteda) # CRAN v3.0.0
library(quanteda.dictionaries) # [github::kbenoit/quanteda.dictionaries] v0.22
library(tidyverse) # CRAN 1.3.0
# Fix the RNG seed and cap quanteda's worker threads.
set.seed(221186)

quanteda_options(threads = 3)

# Load data

# load() restores whatever objects were saved in the file; the code below
# expects it to provide `debates` with a text column `body`.
load("data/debates.Rdata")

# Terms that, when followed by a number ("clause 5", "section 12"), are
# collapsed into a single placeholder during cleaning.
boiler_plate <- c(
  "clause", "schedule", "section", "amendment", "petition", "e-petition",
  "article", "paragraph", "act", "bill"
)

# Jargon phrases: first column of the CSV, whitespace-trimmed and lower-cased.
jargon <- read.csv("data/dictionaries/parliamentary_jargon.csv",
                   stringsAsFactors = FALSE)[, 1] %>%
  trim() %>%
  tolower()

# The phrases are later joined with "|" into one regex. An empty entry would
# add an empty alternative to that regex, and an NA entry would be pasted as
# the literal text "NA", so drop both here.
jargon <- jargon[!is.na(jargon) & nzchar(jargon)]

# Clean the speech text before tokenisation. The steps run in order, each one
# on the output of the previous step:
#   1. "<boilerplate term> <number>" (e.g. "clause 5") -> "parlboilerplate"
#   2. any remaining number                            -> "000"
#   3. parliamentary jargon phrases                    -> "parliamentary_jargon"
#   4. truncate each speech just before the first
#      "question put and agreed to" (speeches without it are kept whole)
# `question_put_agreed` keeps the match position (-1 when absent).
# NOTE(review): step 4 searches the text *after* jargon replacement; if the
# jargon list contains part of that phrase the cut-off will never be found.
debates <- debates %>%
  as_tibble() %>%
  mutate(
    # \\b anchors the term at a word start, so short terms such as "act" no
    # longer match inside longer words ("impact 5" -> "impparlboilerplate").
    body = gsub(
      paste0("\\b(", paste(boiler_plate, collapse = "|"), ") \\d+"),
      "parlboilerplate", body,
      ignore.case = TRUE
    ),
    # A number is a run of digits optionally continued by ","/"." + digits
    # (1,000 / 3.14 / 1,234.5). The previous pattern used an unescaped ".",
    # which matches any character and could swallow two separate numbers
    # (e.g. "10 20") as a single match.
    body = gsub("\\d+([.,]\\d+)*", "000", body),
    body = gsub(
      paste0(jargon, collapse = "|"),
      "parliamentary_jargon", body,
      ignore.case = TRUE
    ),
    question_put_agreed = as.numeric(
      regexpr("question put and agreed to", body, ignore.case = TRUE)
    ),
    body = substring(
      body, 1,
      ifelse(question_put_agreed == -1, nchar(body), question_put_agreed - 1)
    )
  )

# Tokenise the cleaned speeches: build a corpus from the `body` column, split
# into tokens without punctuation, lower-case them, and join multi-word
# entries of the LSD2015 dictionary into single compound tokens.
debate_tokens <- tokens_compound(
  tokens_tolower(
    tokens(corpus(debates, text_field = "body"), remove_punct = TRUE)
  ),
  data_dictionary_LSD2015
)

# Document-feature matrix over the full vocabulary; used below only to decide
# which features to keep.
debate_feats <- debate_tokens %>% dfm(verbose = TRUE)

# Exclude words appearing in fewer than 0.02% of speeches or more than 90% of speeches; exclude stopwords

# Names of the features that survive trimming: document frequency between
# 0.02% and 90% of documents, and not an English stopword.
debate_feats_small <- debate_feats %>%
  dfm_trim(min_docfreq = .0002, max_docfreq = .90, docfreq_type = "prop") %>%
  dfm_select(pattern = stopwords("en"), selection = "remove") %>%
  featnames()

# Remove every token whose feature is not in the kept vocabulary.
# padding = TRUE leaves an empty placeholder where a token was removed, so the
# distances between the remaining tokens are unchanged.
# valuetype = "fixed": the patterns are literal feature names. Under the
# default "glob" matching, a dropped feature containing "*" or "?" would be
# treated as a wildcard and could remove unrelated tokens.
debate_tokens_small <- tokens_remove(
  debate_tokens,
  pattern = setdiff(featnames(debate_feats), debate_feats_small),
  valuetype = "fixed",
  padding = TRUE
)

## Remove unnecessary objects

# The full token set and the debates table are not used below; drop them.
rm(list = c("debate_tokens", "debates"))

## Convert to feature co-occurrence matrix

# Co-occurrences are counted within a 6-token window and weighted by
# 1 / distance (1, 1/2, ..., 1/6). tri = TRUE stores only the upper triangle.
debate_fcm <- debate_tokens_small %>%
  fcm(
    context = "window",
    count = "weighted",
    window = 6,
    weights = 1 / seq_len(6),
    tri = TRUE
  )

## Save output

save(list = "debate_fcm", file = "working/debate_fcm.Rdata")
